import polars as pl
import duckdb
import skimpy
import folium
from plotnine import (
ggplot, aes,
geom_point, geom_boxplot,
scale_y_log10, scale_x_log10,
theme, coord_flip
)
Delivery Standardization
Exploratory Data Analysis | IS 6813
# Setup DuckDB connection
con = duckdb.connect()
# Reference CSVs
cust_addr_zip_map = 'data/customer_address_and_zip_mapping.csv'
customer_profile = 'data/customer_profile.csv'
transaction_data = 'data/transactional_data.csv'
delivery_cost = 'data/delivery_cost_data.xlsx'
# The spatial extension below is used for reading Excel documents
con.execute("""
INSTALL spatial;
LOAD spatial;
""");# Load CSVs to persistent tables
con.execute(f"""
CREATE TABLE cust_addr AS (
SELECT * FROM read_csv_auto('{cust_addr_zip_map}')
);
CREATE TABLE cust_profile AS (
SELECT * FROM read_csv_auto('{customer_profile}')
);
CREATE TABLE transactions AS (
SELECT * FROM read_csv_auto('{transaction_data}')
);
CREATE TABLE delivery_cost AS (
SELECT * FROM st_read('{delivery_cost}')
);
""")<duckdb.duckdb.DuckDBPyConnection at 0x7a8f5b5083f0>
con.sql("SHOW TABLES")┌───────────────┐
│ name │
│ varchar │
├───────────────┤
│ cust_addr │
│ cust_profile │
│ delivery_cost │
│ transactions │
└───────────────┘
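All four tables are registered. As a quick sanity check (an addition, not part of the original run), we might print row counts before digging in:
# Sanity check: row counts for each freshly loaded table
for tbl in ["cust_addr", "cust_profile", "transactions", "delivery_cost"]:
    n = con.execute(f"SELECT COUNT(*) FROM {tbl}").fetchone()[0]
    print(f"{tbl}: {n:,} rows")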
customer_profile.csv
We can now use SQL to query the cust_profile table. Where we need actual programming, we can simply create a polars dataframe:
cust_profile_df = con.sql("FROM cust_profile").pl()
Let’s first look at the columns:
con.sql("DESCRIBE cust_profile")┌──────────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│ column_name │ column_type │ null │ key │ default │ extra │
│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │
├──────────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ CUSTOMER_NUMBER │ BIGINT │ YES │ NULL │ NULL │ NULL │
│ PRIMARY_GROUP_NUMBER │ BIGINT │ YES │ NULL │ NULL │ NULL │
│ FREQUENT_ORDER_TYPE │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ FIRST_DELIVERY_DATE │ DATE │ YES │ NULL │ NULL │ NULL │
│ ON_BOARDING_DATE │ DATE │ YES │ NULL │ NULL │ NULL │
│ COLD_DRINK_CHANNEL │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ TRADE_CHANNEL │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ SUB_TRADE_CHANNEL │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ LOCAL_MARKET_PARTNER │ BOOLEAN │ YES │ NULL │ NULL │ NULL │
│ CO2_CUSTOMER │ BOOLEAN │ YES │ NULL │ NULL │ NULL │
│ ZIP_CODE │ BIGINT │ YES │ NULL │ NULL │ NULL │
├──────────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┤
│ 11 rows 6 columns │
└────────────────────────────────────────────────────────────────────────────┘
11 columns, and all the data types seem reasonable. Let’s assess the sparsity of the data:
cust_profile_df.null_count()
| CUSTOMER_NUMBER | PRIMARY_GROUP_NUMBER | FREQUENT_ORDER_TYPE | FIRST_DELIVERY_DATE | ON_BOARDING_DATE | COLD_DRINK_CHANNEL | TRADE_CHANNEL | SUB_TRADE_CHANNEL | LOCAL_MARKET_PARTNER | CO2_CUSTOMER | ZIP_CODE |
|---|---|---|---|---|---|---|---|---|---|---|
| u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 |
| 0 | 18196 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
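Only PRIMARY_GROUP_NUMBER has nulls (18,196 customers with no parent group). For a broader one-shot profile, skimpy (imported at the top but otherwise unused here) can summarize the frame; a minimal sketch, assuming a skimpy version that accepts polars frames (older versions need .to_pandas() first):
# One-call summary of types, cardinality, and distributions
skimpy.skim(cust_profile_df)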
We have mostly categorical features in here. Let’s examine the distributions of a few of them, such as…
How many groups/individual customers are there? (remember, many individual customers could roll up to a group)
con.sql("""
SELECT
COUNT(DISTINCT COALESCE(PRIMARY_GROUP_NUMBER, CUSTOMER_NUMBER)) AS TOT_CUST
FROM cust_profile
""")┌──────────┐
│ TOT_CUST │
│ int64 │
├──────────┤
│ 19216 │
└──────────┘
There are a few dozen groups with many customers; most IDs, however, represent single customers.
con.sql("""
SELECT
COALESCE(PRIMARY_GROUP_NUMBER, CUSTOMER_NUMBER) AS CUST_ID
,COUNT(*)
FROM cust_profile
GROUP BY ALL
ORDER BY COUNT(*) DESC
""")┌───────────┬──────────────┐
│ CUST_ID │ count_star() │
│ int64 │ int64 │
├───────────┼──────────────┤
│ 437 │ 670 │
│ 2487 │ 616 │
│ 340 │ 558 │
│ 1194 │ 365 │
│ 2183 │ 307 │
│ 1431 │ 237 │
│ 404 │ 219 │
│ 104 │ 177 │
│ 1594 │ 159 │
│ 384 │ 158 │
│ · │ · │
│ · │ · │
│ · │ · │
│ 600569534 │ 1 │
│ 501625173 │ 1 │
│ 500636951 │ 1 │
│ 500944185 │ 1 │
│ 501213180 │ 1 │
│ 501681374 │ 1 │
│ 600083643 │ 1 │
│ 600569505 │ 1 │
│ 600069200 │ 1 │
│ 501487334 │ 1 │
├───────────┴──────────────┤
│ ? rows 2 columns │
└──────────────────────────┘
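To quantify “a few dozen” (an added check, using the same rollup logic as above), we can count the IDs that cover more than one customer:
con.sql("""
    SELECT COUNT(*) AS MULTI_CUST_IDS
    FROM (
        SELECT COALESCE(PRIMARY_GROUP_NUMBER, CUSTOMER_NUMBER) AS CUST_ID, COUNT(*) AS N
        FROM cust_profile
        GROUP BY ALL
    )
    WHERE N > 1
""")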
What kind of order frequency is seen across these?
con.sql("""
SELECT FREQUENT_ORDER_TYPE, COUNT(*) AS VOL
FROM cust_profile
GROUP BY FREQUENT_ORDER_TYPE
ORDER BY COUNT(*) DESC
""")┌─────────────────────┬───────┐
│ FREQUENT_ORDER_TYPE │ VOL │
│ varchar │ int64 │
├─────────────────────┼───────┤
│ SALES REP │ 20017 │
│ OTHER │ 5378 │
│ MYCOKE360 │ 2386 │
│ CALL CENTER │ 1375 │
│ MYCOKE LEGACY │ 981 │
│ EDI │ 341 │
└─────────────────────┴───────┘
con.sql("""
SELECT COLD_DRINK_CHANNEL, COUNT(*) AS VOL
FROM cust_profile
GROUP BY COLD_DRINK_CHANNEL
ORDER BY COUNT(*) DESC
""")┌────────────────────┬───────┐
│ COLD_DRINK_CHANNEL │ VOL │
│ varchar │ int64 │
├────────────────────┼───────┤
│ DINING │ 15518 │
│ GOODS │ 5829 │
│ EVENT │ 3079 │
│ PUBLIC SECTOR │ 1740 │
│ BULK TRADE │ 1323 │
│ ACCOMMODATION │ 1241 │
│ WORKPLACE │ 1201 │
│ WELLNESS │ 490 │
│ CONVENTIONAL │ 57 │
└────────────────────┴───────┘
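DINING dominates the cold drink channels. If a visual helps, here’s a quick bar-chart sketch with plotnine (geom_col isn’t in the imports above, so it’s pulled in here):
from plotnine import geom_col

channel_counts = con.sql("""
    SELECT COLD_DRINK_CHANNEL, COUNT(*) AS VOL
    FROM cust_profile
    GROUP BY COLD_DRINK_CHANNEL
""").pl()
(
    ggplot(channel_counts)
    + geom_col(aes(x="COLD_DRINK_CHANNEL", y="VOL"))
    + coord_flip()
)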
We see mostly local market partners.
con.sql("""
SELECT LOCAL_MARKET_PARTNER, COUNT(*) AS VOL
FROM cust_profile
GROUP BY LOCAL_MARKET_PARTNER
ORDER BY COUNT(*) DESC
""")┌──────────────────────┬───────┐
│ LOCAL_MARKET_PARTNER │ VOL │
│ boolean │ int64 │
├──────────────────────┼───────┤
│ true │ 27355 │
│ false │ 3123 │
└──────────────────────┴───────┘
However, if we intersect by group, there may be more to the story:
con.sql("""
SELECT
LOCAL_MARKET_PARTNER,
SUM(CASE WHEN PRIMARY_GROUP_NUMBER IS NULL THEN 1 ELSE 0 END) AS CUST,
SUM(CASE WHEN PRIMARY_GROUP_NUMBER IS NULL THEN 0 ELSE 1 END) AS "GROUP"
FROM cust_profile
GROUP BY LOCAL_MARKET_PARTNER
""")┌──────────────────────┬────────┬────────┐
│ LOCAL_MARKET_PARTNER │ CUST │ GROUP │
│ boolean │ int128 │ int128 │
├──────────────────────┼────────┼────────┤
│ true │ 17597 │ 9758 │
│ false │ 599 │ 2524 │
└──────────────────────┴────────┴────────┘
Solo customers are far more likely to be local market partners (which makes sense).
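To put rates on that claim, here’s a small polars aggregation over the profile frame (an added sketch; the counts above imply roughly 97% of solo customers are local market partners versus about 79% of grouped ones):
lmp_rates = (
    cust_profile_df
    # Solo customers have no PRIMARY_GROUP_NUMBER
    .with_columns(pl.col("PRIMARY_GROUP_NUMBER").is_null().alias("IS_SOLO"))
    .group_by("IS_SOLO")
    # Mean of the boolean flag = share of local market partners
    .agg(pl.col("LOCAL_MARKET_PARTNER").cast(pl.Float64).mean().alias("LMP_RATE"))
)
print(lmp_rates)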
What is the distribution of buying CO2?
con.sql("""
SELECT CO2_CUSTOMER,
SUM(CASE WHEN PRIMARY_GROUP_NUMBER IS NULL THEN 1 ELSE 0 END) AS CUST,
SUM(CASE WHEN PRIMARY_GROUP_NUMBER IS NULL THEN 0 ELSE 1 END) AS "GROUP"
FROM cust_profile
GROUP BY CO2_CUSTOMER
""")┌──────────────┬────────┬────────┐
│ CO2_CUSTOMER │ CUST │ GROUP │
│ boolean │ int128 │ int128 │
├──────────────┼────────┼────────┤
│ true │ 9456 │ 2526 │
│ false │ 8740 │ 9756 │
└──────────────┴────────┴────────┘
It’s about 50-50 overall, but we see that grouped customers (“franchises”) are far less likely to source their CO2 from Swire. So how valuable is that business?
customer_address_and_zip_mapping.csv
Let’s create our polars data frame:
cust_addr_df = con.sql("FROM cust_addr").pl()
And now look at the columns:
con.sql("DESCRIBE cust_addr")┌──────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│ column_name │ column_type │ null │ key │ default │ extra │
│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │
├──────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ zip │ BIGINT │ YES │ NULL │ NULL │ NULL │
│ full address │ VARCHAR │ YES │ NULL │ NULL │ NULL │
└──────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘
We need to pull out those full addresses. We can do that by splitting the string, grabbing list elements of interest, and then properly casting.
con.execute("""
CREATE TABLE cust_addr_detail AS (
SELECT
zip
,LIST_ELEMENT(STRING_SPLIT("full address", ','), 2) AS city
,LIST_ELEMENT(STRING_SPLIT("full address", ','), 3) AS state
,LIST_ELEMENT(STRING_SPLIT("full address", ','), 4) AS state_abbr
,LIST_ELEMENT(STRING_SPLIT("full address", ','), 5) AS county
,CAST(LIST_ELEMENT(STRING_SPLIT("full address", ','), 7) AS DOUBLE) AS lat
,CAST(LIST_ELEMENT(STRING_SPLIT("full address", ','), 8) AS DOUBLE) AS lon
FROM cust_addr
)
""")<duckdb.duckdb.DuckDBPyConnection at 0x7a8f5b5083f0>
con.sql("DESCRIBE cust_addr_detail")┌─────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│ column_name │ column_type │ null │ key │ default │ extra │
│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │
├─────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ zip │ BIGINT │ YES │ NULL │ NULL │ NULL │
│ city │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ state │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ state_abbr │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ county │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ lat │ DOUBLE │ YES │ NULL │ NULL │ NULL │
│ lon │ DOUBLE │ YES │ NULL │ NULL │ NULL │
└─────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘
Looks great! Let’s join and see where most customers are by state.
con.sql("""
SELECT
cad.state
,COUNT(*)
FROM cust_profile cp
INNER JOIN cust_addr_detail cad ON cad.zip = cp.ZIP_CODE
GROUP BY cad.state
""")┌───────────────┬──────────────┐
│ state │ count_star() │
│ varchar │ int64 │
├───────────────┼──────────────┤
│ Louisiana │ 390 │
│ Maryland │ 4901 │
│ Kansas │ 7163 │
│ Kentucky │ 6991 │
│ Massachusetts │ 11033 │
└───────────────┴──────────────┘
It appears that most customers are found in Massachusetts. Let’s see if we can’t render a map:
cust_addr = con.sql("SELECT * FROM cust_addr_detail").pl()
swire_map = folium.Map(
location = [
cust_addr['lat'].mean(),
cust_addr['lon'].mean()
],
zoom_start = 4.5,
control_scale = True
)
for row in cust_addr.iter_rows():
folium.Marker(
location = [row[5], row[6]],
icon = folium.Icon(color = "red")
).add_to(swire_map)
swire_map
There is somewhat more concentration around city centers, but not nearly as much as I would expect. Seems curious.
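With one raw folium.Marker per row, the map renders slowly and overplots badly; folium’s MarkerCluster plugin (a sketch, not part of the original) would group nearby points and may make the density pattern easier to read:
from folium.plugins import MarkerCluster

# Cluster markers so dense areas collapse into counts at low zoom
cluster = MarkerCluster().add_to(swire_map)
for row in cust_addr.iter_rows(named=True):
    folium.Marker(location=[row["lat"], row["lon"]]).add_to(cluster)
swire_map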
transactional_data.csv
Let’s create our polars data frame:
transac_df = con.sql("FROM transactions").pl()
And now look at the columns:
con.sql("DESCRIBE transactions")┌───────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│ column_name │ column_type │ null │ key │ default │ extra │
│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │
├───────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ TRANSACTION_DATE │ DATE │ YES │ NULL │ NULL │ NULL │
│ WEEK │ BIGINT │ YES │ NULL │ NULL │ NULL │
│ YEAR │ BIGINT │ YES │ NULL │ NULL │ NULL │
│ CUSTOMER_NUMBER │ BIGINT │ YES │ NULL │ NULL │ NULL │
│ ORDER_TYPE │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ ORDERED_CASES │ DOUBLE │ YES │ NULL │ NULL │ NULL │
│ LOADED_CASES │ DOUBLE │ YES │ NULL │ NULL │ NULL │
│ DELIVERED_CASES │ DOUBLE │ YES │ NULL │ NULL │ NULL │
│ ORDERED_GALLONS │ DOUBLE │ YES │ NULL │ NULL │ NULL │
│ LOADED_GALLONS │ DOUBLE │ YES │ NULL │ NULL │ NULL │
│ DELIVERED_GALLONS │ DOUBLE │ YES │ NULL │ NULL │ NULL │
├───────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┤
│ 11 rows 6 columns │
└─────────────────────────────────────────────────────────────────────────┘
Let’s look at the summary statistics here:
transac_df.describe()
| statistic | TRANSACTION_DATE | WEEK | YEAR | CUSTOMER_NUMBER | ORDER_TYPE | ORDERED_CASES | LOADED_CASES | DELIVERED_CASES | ORDERED_GALLONS | LOADED_GALLONS | DELIVERED_GALLONS |
|---|---|---|---|---|---|---|---|---|---|---|---|
| str | str | f64 | f64 | f64 | str | f64 | f64 | f64 | f64 | f64 | f64 |
| "count" | "1045540" | 1.04554e6 | 1.04554e6 | 1.04554e6 | "1045540" | 1.04554e6 | 1.04554e6 | 1.04554e6 | 1.04554e6 | 1.04554e6 | 1.04554e6 |
| "null_count" | "0" | 0.0 | 0.0 | 0.0 | "0" | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| "mean" | "2023-12-27 11:40:07.138000" | 26.228599 | 2023.496534 | 5.4664e8 | null | 26.851646 | 25.922583 | 25.133342 | 9.873689 | 9.604892 | 9.208331 |
| "std" | null | 14.516752 | 0.499988 | 4.9427e7 | null | 126.762052 | 122.787259 | 121.515626 | 26.469447 | 25.649953 | 25.175433 |
| "min" | "2023-01-01" | 1.0 | 2023.0 | 5.00245678e8 | "CALL CENTER" | 0.0 | 0.0 | -3132.0 | 0.0 | 0.0 | -1792.5 |
| "25%" | "2023-06-30" | 14.0 | 2023.0 | 5.0109192e8 | null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| "50%" | "2023-12-27" | 26.0 | 2023.0 | 5.01548213e8 | null | 7.0 | 7.0 | 6.0 | 0.0 | 0.0 | 0.0 |
| "75%" | "2024-06-25" | 38.0 | 2024.0 | 6.00080939e8 | null | 18.5 | 18.0 | 17.3332 | 12.5 | 12.5 | 12.5 |
| "max" | "2024-12-31" | 52.0 | 2024.0 | 6.00975408e8 | "null" | 8479.888 | 8171.564 | 8069.483 | 2562.5 | 2562.5 | 2292.5 |
No missing data, which is very good. Note, though, the negative minimums for delivered cases and gallons (returns, presumably), which we revisit at the end. On a per-transaction-date basis, most customers order very little or nothing. Let’s evaluate on an annual basis.
con.sql("""
SELECT
YEAR
,CUSTOMER_NUMBER
,SUM(ORDERED_CASES) + SUM(ORDERED_GALLONS) AS ORDERED_QTY
FROM transactions
GROUP BY
YEAR, CUSTOMER_NUMBER
""").pl().describe()| statistic | YEAR | CUSTOMER_NUMBER | ORDERED_QTY |
|---|---|---|---|
| str | f64 | f64 | f64 |
| "count" | 55653.0 | 55653.0 | 55653.0 |
| "null_count" | 0.0 | 0.0 | 0.0 |
| "mean" | 2023.531885 | 5.4083e8 | 689.950353 |
| "std" | 0.498987 | 4.8565e7 | 5111.046976 |
| "min" | 2023.0 | 5.00245678e8 | 0.0 |
| "25%" | 2023.0 | 5.01127715e8 | 67.0 |
| "50%" | 2024.0 | 5.01542171e8 | 170.0 |
| "75%" | 2024.0 | 6.00077803e8 | 457.5 |
| "max" | 2024.0 | 6.00975408e8 | 459175.7404 |
We get our first glimpse at why the annual 400-gallon threshold exists: over two-thirds of customer-years aren’t ordering that much. Woof.
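To pin down “over two-thirds” (an added check, treating cases and gallons as one combined unit exactly as the aggregation above does):
annual = con.sql("""
    SELECT YEAR, CUSTOMER_NUMBER,
           SUM(ORDERED_CASES) + SUM(ORDERED_GALLONS) AS ORDERED_QTY
    FROM transactions
    GROUP BY ALL
""").pl()
# Mean of the boolean mask = share of customer-years under the threshold
share_below = (annual["ORDERED_QTY"] < 400).mean()
print(f"{share_below:.1%} of customer-years fall below the 400-unit threshold")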
annual_data = con.sql("""
SELECT
t.CUSTOMER_NUMBER
,cp.COLD_DRINK_CHANNEL
,t.YEAR
,date_diff('week', ON_BOARDING_DATE, FIRST_DELIVERY_DATE) AS WK_RAMP_UP
,date_diff('year', FIRST_DELIVERY_DATE, current_date) AS YR_TENURE
,SUM(t.ORDERED_CASES) + SUM(t.ORDERED_GALLONS) AS TOTAL_ORDERED
FROM transactions t
INNER JOIN cust_profile cp ON cp.CUSTOMER_NUMBER = t.CUSTOMER_NUMBER
GROUP BY ALL
""").pl()annual_data.shape(55653, 6)
(
ggplot(annual_data)
+ geom_point(
aes("WK_RAMP_UP", "TOTAL_ORDERED", color="COLD_DRINK_CHANNEL"),
alpha = 0.75
)
+ scale_y_log10()
+ scale_x_log10()
)
RuntimeWarning: divide by zero encountered in log10
RuntimeWarning: invalid value encountered in log10
PlotnineWarning: geom_point : Removed 1 rows containing missing values.
The log10 warnings come from zero (and any invalid) values in the plotted columns; the log scale simply drops those points, hence the removed row.
(
ggplot(annual_data)
+ geom_point(
aes("YR_TENURE", "TOTAL_ORDERED", color="COLD_DRINK_CHANNEL"),
alpha = 0.75
)
+ scale_y_log10()
+ scale_x_log10()
)
RuntimeWarning: divide by zero encountered in log10
delivery_cost_data.xlsx
Let’s create our polars data frame:
deliv_df = con.sql("FROM delivery_cost").pl()
And now look at the columns:
con.sql("DESCRIBE delivery_cost")┌──────────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│ column_name │ column_type │ null │ key │ default │ extra │
│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │
├──────────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ Cold Drink Channel │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ Vol Range │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ Applicable To │ VARCHAR │ YES │ NULL │ NULL │ NULL │
│ Median Delivery Cost │ DOUBLE │ YES │ NULL │ NULL │ NULL │
│ Cost Type │ VARCHAR │ YES │ NULL │ NULL │ NULL │
└──────────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘
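The Vol Range column stores volume bands as text, which the pipeline below splits into numeric bounds. For intuition, here’s a plain-Python equivalent (band formats like '0 - 149' and '1350+' are assumptions inferred from the SQL that follows):
def parse_vol_range(vol_range: str) -> tuple[int, int]:
    """Assumed formats: '0 - 149' -> (0, 149); '1350+' -> (1350, INT_MAX)."""
    parts = vol_range.split(" - ")
    lo = int(parts[0].rstrip("+"))          # strip the '+' on open-ended bands
    hi = int(parts[1]) if len(parts) > 1 else 2**31 - 1  # unbounded upper edge
    return lo, hi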
Let’s intersect this data with each customer and their orders.
cost_data = con.sql("""
WITH
delivery AS (
SELECT *
,LIST_ELEMENT(STRING_SPLIT(d."Vol Range", ' - '), 1) AS MIN_COST
,LIST_ELEMENT(STRING_SPLIT(d."Vol Range", ' - '), 2) AS MAX_COST
FROM delivery_cost d
),
delivery_cleaned AS (
SELECT *
,CAST((CASE
WHEN MIN_COST LIKE '%+' THEN trim(MIN_COST, '+')
ELSE MIN_COST END
) AS INT) AS MIN_COST_VOL
,COALESCE(CAST(MAX_COST AS INT), 2147483647) AS MAX_COST_VOL
FROM delivery
),
joined AS (
SELECT
t.YEAR
,t.CUSTOMER_NUMBER
,c.COLD_DRINK_CHANNEL
,c.FREQUENT_ORDER_TYPE
,SUM(CASE WHEN ORDERED_CASES > 0 OR ORDERED_GALLONS > 0 THEN 1 ELSE 0 END) AS ORDER_CNT
,SUM(ORDERED_CASES) AS ANNUAL_CASES
,SUM(ORDERED_GALLONS) AS ANNUAL_GALLONS
FROM transactions t
INNER JOIN cust_profile c ON c.CUSTOMER_NUMBER = t.CUSTOMER_NUMBER
GROUP BY ALL
),
formatted AS (
SELECT
j.*
,d2."Median Delivery Cost" AS COST_CASES
,d1."Median Delivery Cost" AS COST_GALLONS
,(d1."Median Delivery Cost" * j.ANNUAL_GALLONS) + (d2."Median Delivery Cost" * j.ANNUAL_CASES) AS DELIVERY_COST
,((d1."Median Delivery Cost" * j.ANNUAL_GALLONS) + (d2."Median Delivery Cost" * j.ANNUAL_CASES)) / ORDER_CNT AS DELIVERY_COST_PER_ORDER
FROM joined j
LEFT JOIN delivery_cleaned d1 ON d1."Cold Drink Channel" = j.COLD_DRINK_CHANNEL
AND d1."Applicable To" = 'Fountain'
AND j.ANNUAL_GALLONS BETWEEN d1.MIN_COST_VOL AND d1.MAX_COST_VOL
LEFT JOIN delivery_cleaned d2 ON d2."Cold Drink Channel" = j.COLD_DRINK_CHANNEL
AND d2."Applicable To" = 'Bottles and Cans'
AND j.ANNUAL_CASES BETWEEN d2.MIN_COST_VOL AND d2.MAX_COST_VOL
)
FROM formatted
""").pl()(
ggplot(cost_data)
+ geom_boxplot(
aes(x="COLD_DRINK_CHANNEL", y="DELIVERY_COST")
)
+ scale_y_log10()
+ coord_flip()
)
RuntimeWarning: divide by zero encountered in log10
PlotnineWarning: stat_boxplot : Removed 441 rows containing non-finite values.
(
ggplot(cost_data)
+ geom_boxplot(
aes(x="FREQUENT_ORDER_TYPE", y="DELIVERY_COST")
)
+ scale_y_log10()
+ coord_flip()
)
RuntimeWarning: divide by zero encountered in log10
PlotnineWarning: stat_boxplot : Removed 441 rows containing non-finite values.
con.sql("""
SELECT
SUM(CASE WHEN DELIVERED_GALLONS < 0 OR DELIVERED_CASES < 0 THEN 1 ELSE 0 END)
,SUM(CASE WHEN DELIVERED_GALLONS < 0 OR DELIVERED_CASES < 0 THEN 1 ELSE 0 END) / COUNT(*)
,COUNT(*)
FROM transactions t
""")┌──────────────────────────────────────────────────────────────────────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────┬──────────────┐
│ sum(CASE WHEN (((DELIVERED_GALLONS < 0) OR (DELIVERED_CASES < 0))) THEN (1) ELSE 0 END) │ (sum(CASE WHEN (((DELIVERED_GALLONS < 0) OR (DELIVERED_CASES < 0))) THEN (1) ELSE 0 END) / count_star()) │ count_star() │
│ int128 │ double │ int64 │
├──────────────────────────────────────────────────────────────────────────────────────────┼───────────────────────────────────────────────────────────────────────────────────────────────────────────┼──────────────┤
│ 4221 │ 0.004037148267880712 │ 1045540 │
└──────────────────────────────────────────────────────────────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┘
con.sql("""
SELECT
COUNT(DISTINCT COALESCE(cp.PRIMARY_GROUP_NUMBER, t.CUSTOMER_NUMBER))
FROM transactions t
INNER JOIN cust_profile cp ON cp.CUSTOMER_NUMBER = t.CUSTOMER_NUMBER
WHERE DELIVERED_GALLONS < 0 OR DELIVERED_CASES < 0
""")┌──────────────────────────────────────────────────────────────────────┐
│ count(DISTINCT COALESCE(cp.PRIMARY_GROUP_NUMBER, t.CUSTOMER_NUMBER)) │
│ int64 │
├──────────────────────────────────────────────────────────────────────┤
│ 1969 │
└──────────────────────────────────────────────────────────────────────┘